/* * The MIT License (MIT) * * Copyright (c) 2015 Jakob Hendeß * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package org.xlrnet.metadict.engines.leo; import com.google.common.collect.ImmutableMap; import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xlrnet.metadict.api.engine.SearchEngine; import org.xlrnet.metadict.api.exception.MetadictTechnicalException; import org.xlrnet.metadict.api.language.GrammaticalNumber; import org.xlrnet.metadict.api.language.GrammaticalTense; import org.xlrnet.metadict.api.language.Language; import org.xlrnet.metadict.api.query.*; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.time.ZoneOffset; import java.time.ZonedDateTime; import java.util.Iterator; import java.util.Map; /** * This is the implementation for the leo.org search engine for Metadict. It uses the internal REST-endpoint from * leo.org that is used for their AJAX-calls. */ public class LeoEngine implements SearchEngine { private static final Logger LOGGER = LoggerFactory.getLogger(LeoEngine.class); private static final Map<String, EntryType> ENTRY_TYPE_MAP = ImmutableMap.<String, EntryType>builder() .put("subst", EntryType.NOUN) .put("noun", EntryType.NOUN) .put("adjv", EntryType.OTHER_WORD) // Adjective or adverb .put("adverb", EntryType.ADVERB) .put("adjective", EntryType.ADJECTIVE) .put("verb", EntryType.VERB) .put("phrase", EntryType.PHRASE) .put("example", EntryType.EXAMPLE) // TODO: Pronouns? .build(); private static final Map<String, String> DEFAULT_QUERY_DATA = ImmutableMap.<String, String>builder() .put("tolerMode", "nof") .put("lang", "en") .put("rmWords", "off") .put("rmSearch", "on") .put("searchLoc", "0") .put("resultOrder", "basic") .put("multiwordShowSingle", "on") .put("sectLenMax", "16") .put("n", "1").build(); private static final String SECTION_NAME_ATTRIBUTE = "sctName"; /** * Strips various kinds of whitespace at the beginning and at the end of the input string and none-blank characters * from the middle of the string. * * @param str * The input string. * @return Stripped string. */ private static String cleanWhitespace(String str) { StringUtils.replaceChars(str, "\u00A0\n\t\r", ""); return StringUtils.strip(str, " \u00A0\n\t\r"); } @NotNull @Override public BilingualQueryResult executeBilingualQuery(@NotNull String queryInput, @NotNull Language inputLanguage, @NotNull Language outputLanguage, boolean allowBothWay) throws MetadictTechnicalException { Connection targetConnection = buildTargetConnection(queryInput, inputLanguage, outputLanguage); Document doc; try { doc = targetConnection.get(); } catch (IOException e) { LOGGER.error("Fetching response from backend failed", e); throw new MetadictTechnicalException(e); } BilingualQueryResultBuilder builder = processDocument(doc); return builder.build(); } /** * Try to extract the plural form if the automatic detection has failed. * <p> * Example: "house - pl.: houses" should return "houses" * * @param inputString * The input string. * @return the plural form or null if nothing could be found */ private String alternativeExtractPluralString(String inputString) { int pluralIndex = StringUtils.indexOfIgnoreCase(inputString, "pl.:"); if (pluralIndex < 0) { return null; } String pluralSubstring = StringUtils.substring(inputString, pluralIndex + 4); String substringTrim = StringUtils.substringBefore(pluralSubstring, "-"); return cleanWhitespace(substringTrim); } private Connection buildTargetConnection(String searchString, Language inputLanguage, Language outputLanguage) { String targetDictionary = resolveDictionaryConfig(inputLanguage, outputLanguage); if (targetDictionary == null) { targetDictionary = resolveDictionaryConfig(outputLanguage, inputLanguage); if (targetDictionary == null) { throw new IllegalArgumentException("No suitable dictionary configuration found - this might be an internal metadict error"); } } return Jsoup.connect("https://dict.leo.org/dictQuery/m-vocab/" + targetDictionary + "/query.xml") .userAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36") .data(DEFAULT_QUERY_DATA) .data("lp", targetDictionary) .data("search", searchString) .data("t", ZonedDateTime.now(ZoneOffset.UTC).toString()); } /** * Extracts the abbreviation information from a given representation string. Tries to use "Abk." and "abbr." for * detecting abbreviations. * <p> * Example: If the input is "zum Beispiel [Abk.: z. B.]", then the abbreviation is "z. B.". * * @param representation * The input string. * @return the domain string or null if none could be found */ @Nullable private String extractAbbreviationString(String representation) { String abbreviation = StringUtils.substringBetween(representation, "[abbr.:", "]"); if (abbreviation == null) { abbreviation = StringUtils.substringBetween(representation, "[Abk.:", "]"); } if (abbreviation != null) { return cleanWhitespace(abbreviation); } return null; } /** * Extracts the domain information from a given representation string. * <p> * Example: If the input is "drive-in restaurant [cook.]", then the domain is "cook." * * @param representation * The input string. * @return the domain string or null if none could be found */ @Nullable private String extractDomainString(String representation) { String substring = StringUtils.substringAfterLast(representation, "["); if (substring != null) { String domain = StringUtils.substringBefore(substring, "]"); // Check if the extracted domain string is not an abbreviation if (!(StringUtils.endsWith(".", domain) && (StringUtils.startsWithIgnoreCase("abbr.:", domain) || StringUtils.startsWithIgnoreCase("abk.:", domain)))) { return domain; } } return null; } /** * Try to extract the best fitting general word form from an Elements object. If there are more than one general * form, the first one that contains parentheses or a dot (".") will be returned. If none contains parentheses, the * first element will be returned. * * @param side * An Elements object of {@code <word>}-Tags * @return The general form of the word. */ private String extractGeneralForm(Element side) { Elements elements = side.getElementsByTag("word"); // TODO: Try to detect the correct form with "(sth.)" -> code below is not completely working (!) /*if (elements.size() > 1) { for (Element element : elements) { String elementText = element.text(); if (elementText.length() > elements.first().text().length()) return elementText; } }*/ return elements.first().text(); } private BilingualQueryResultBuilder processDocument(Document doc) { BilingualQueryResultBuilder resultBuilder = ImmutableBilingualQueryResult.builder(); // Find sections: Elements sections = doc.getElementsByTag("section"); // Process sections: sections.stream().parallel().forEach(s -> processSection(s, resultBuilder)); // Find similarities: Element similarityNode = doc.getElementsByTag("similar").first(); // Process similarities: processSimilarities(similarityNode, resultBuilder); // Find external contents: Element forumLinkNode = doc.getElementsByTag("forumRef").first(); // Process external contents: processForumLinks(forumLinkNode, resultBuilder); return resultBuilder; } /** * Process all links to the leo.org forums and provide them as external content. * * @param forumLinkNode * The node which contains the forum link. * @param resultBuilder * The builder for the bilingual query result. */ private void processForumLinks(@Nullable Element forumLinkNode, @NotNull BilingualQueryResultBuilder resultBuilder) { if (forumLinkNode == null) { LOGGER.warn("Couldn't find forum link node"); return; } ExternalContentBuilder builder = ImmutableExternalContent.builder(); // TODO: Rewrite this for loop - it seems to be very strangely written (but works) for (Iterator<Element> iterator = forumLinkNode.getAllElements().iterator(); iterator.hasNext(); ) { Element linkNode = iterator.next(); if ("link".equals(linkNode.tag().getName())) { builder = ImmutableExternalContent.builder(); String link = linkNode.attr("href"); if (StringUtils.isNotBlank(link)) { try { builder.setLink(new URL("https://dict.leo.org/" + link)); linkNode = iterator.next(); } catch (MalformedURLException e) { LOGGER.warn("Illegal URL for forum entry", e); } } else { LOGGER.trace("Skipping link node with empty href attribute"); continue; } } if ("subject".equals(linkNode.tag().getName())) { String subject = linkNode.text(); linkNode = iterator.next(); if (StringUtils.isNotBlank(subject)) { builder.setTitle("leo.org forum: " + subject); } else { LOGGER.trace("Skipping blank subject node"); continue; } } if (StringUtils.equals(linkNode.tag().getName(), "teaser")) { String teaser = linkNode.text(); if (StringUtils.isNotBlank(teaser)) { builder.setDescription(teaser); resultBuilder.addExternalContent(builder.build()); } else { LOGGER.warn("Skipping blank teaser node"); } } } } /** * Process the content contents of a single entry node. The entry node is the root-node for a single dictionary * entry. */ private void processEntryNode(@NotNull Element entryNode, @NotNull BilingualQueryResultBuilder resultBuilder, @NotNull EntryType fallbackEntryType) { // Try to determine the entry type again EntryType entryType = fallbackEntryType; Element category = entryNode.getElementsByTag("category").first(); if (category != null) { entryType = resolveSectionType(category.attr("type")); if (entryType == EntryType.UNKNOWN) { entryType = fallbackEntryType; } } // Process each side separately Elements sideNodes = entryNode.getElementsByTag("side"); Element leftSide = sideNodes.get(0); Element rightSide = sideNodes.get(1); DictionaryObject leftObject = processSideNode(leftSide, entryType); DictionaryObject rightObject = processSideNode(rightSide, entryType); // Build the final DictionaryEntry resultBuilder.addBilingualEntry(ImmutableBilingualEntry.builder() .setEntryType(entryType) .setInputObject(leftObject) .setOutputObject(rightObject).build()); } private void processSection(Element sectionNode, BilingualQueryResultBuilder resultBuilder) { String sectionType = sectionNode.attr(SECTION_NAME_ATTRIBUTE); EntryType fallbackEntryType = resolveSectionType(sectionType); for (Element entryNode : sectionNode.getElementsByTag("entry")) { processEntryNode(entryNode, resultBuilder, fallbackEntryType); } } private DictionaryObject processSideNode(Element side, EntryType entryType) { DictionaryObjectBuilder dictionaryObjectBuilder = ImmutableDictionaryObject.builder(); // Extract general form: String generalForm = cleanWhitespace(extractGeneralForm(side)); // Extract language: String languageIdentifier = side.attr("lang"); languageIdentifier = fixLanguageIdentifier(languageIdentifier); Language language = Language.getExistingLanguageById(languageIdentifier); final String[] pluralForm = new String[1]; // Workaround since objects inside lambda should be final // Extract description and plural form: side.getElementsByTag("small") .stream() .filter(e -> !StringUtils.startsWith(e.text(), "|")) // Filter verb tenses! .forEach(element -> { String elementText = element.text(); String elementHtml = element.outerHtml(); if (StringUtils.startsWithIgnoreCase(elementText, "pl.:")) { pluralForm[0] = StringUtils.substringAfter(elementText, ".:"); if (StringUtils.isNotBlank(pluralForm[0])) { dictionaryObjectBuilder.setAdditionalForm(GrammaticalNumber.PLURAL, cleanWhitespace(pluralForm[0])); } } else if (isValidDescriptionHtml(elementHtml)) { elementText = StringUtils.strip(elementText, "-"); dictionaryObjectBuilder.setDescription(cleanWhitespace(elementText)); } }); String fullRepresentation = side.getElementsByTag("repr").text(); // Test for domain specific content: String domain = extractDomainString(fullRepresentation); if (StringUtils.isNotBlank(domain)) { dictionaryObjectBuilder.setDomain(domain); } // Test for abbreviation String abbreviation = extractAbbreviationString(fullRepresentation); if (StringUtils.isNotBlank(abbreviation)) { dictionaryObjectBuilder.setAbbreviation(abbreviation); } // Try to detect alternative plural form: if (pluralForm[0] == null) { pluralForm[0] = alternativeExtractPluralString(fullRepresentation); if (StringUtils.isNotBlank(pluralForm[0])) { dictionaryObjectBuilder.setAdditionalForm(GrammaticalNumber.PLURAL, pluralForm[0]); } } // Process additional forms (e.g. verb tenses): String additionalFormText = side.getElementsByTag("repr").get(0).getElementsByTag("small").text(); processTenses(entryType, dictionaryObjectBuilder, language, additionalFormText); return dictionaryObjectBuilder .setGeneralForm(generalForm) .setLanguage(language) .build(); } private String fixLanguageIdentifier(String languageIdentifier) { return "ch".equals(languageIdentifier) ? "cn" : languageIdentifier; } private boolean isValidDescriptionHtml(String elementHtml) { return StringUtils.startsWithIgnoreCase(elementHtml, "<small><i>") && StringUtils.endsWith(elementHtml, "</i></small>") && !StringUtils.containsIgnoreCase(elementHtml, ".:") && !StringUtils.containsIgnoreCase(elementHtml, ".]") && !StringUtils.containsIgnoreCase(elementHtml, "auch:") && !StringUtils.containsIgnoreCase(elementHtml, "also:"); } private void processSimilarities(@Nullable Element similarityNode, @NotNull EngineQueryResultBuilder engineQueryResultBuilder) { if (similarityNode == null) { LOGGER.warn("Couldn't find similarity node"); return; } Elements sides = similarityNode.getElementsByTag("side"); for (Element side : sides) { String lang = side.attr("lang"); lang = fixLanguageIdentifier(lang); Language sideLanguage = Language.getExistingLanguageById(lang); for (Element word : side.getElementsByTag("word")) { String wordText = cleanWhitespace(word.text()); engineQueryResultBuilder.addSimilarRecommendation( ImmutableDictionaryObject.builder() .setLanguage(sideLanguage) .setGeneralForm(wordText) .build() ); } } } private void processTenses(EntryType entryType, DictionaryObjectBuilder dictionaryObjectBuilder, Language language, String representation) { // Try to extract verb tenses in english and german dictionary: if (entryType == EntryType.VERB && (Language.ENGLISH.equals(language) || Language.GERMAN.equals(language))) { String tensesString = StringUtils.substringBetween(representation, "|", "|"); if (tensesString != null) { String[] tensesArray = StringUtils.split(tensesString, ","); if (tensesArray.length != 2) { LOGGER.warn("Tenses array {} has unexpected length {} instead of 2", tensesArray, tensesArray.length); } dictionaryObjectBuilder.setAdditionalForm(GrammaticalTense.PAST_TENSE, cleanWhitespace(tensesArray[0])); if (tensesArray.length >= 2) { dictionaryObjectBuilder.setAdditionalForm(GrammaticalTense.PAST_PERFECT, cleanWhitespace(tensesArray[1])); } } } } /** * Resolve the internal query configuration for the leo.org backend. Currently supported: <ul> <li>German - * English</li> <li>German - French</li> <li>German - Spanish</li> <li>German - Italian</li> <li>German - * Chinese</li> <li>German - Russian</li> </ul> */ private String resolveDictionaryConfig(Language inputLanguage, Language outputLanguage) { switch (inputLanguage.getIdentifier()) { case "de": switch (outputLanguage.getIdentifier()) { case "en": return "ende"; case "fr": return "frde"; case "es": return "esde"; case "it": return "itde"; case "cn": return "chde"; case "ru": return "rude"; } break; case "en": switch (outputLanguage.getIdentifier()) { case "de": return "ende"; } break; case "fr": switch (outputLanguage.getIdentifier()) { case "de": return "frde"; } break; case "es": switch (outputLanguage.getIdentifier()) { case "de": return "esde"; } break; case "it": switch (outputLanguage.getIdentifier()) { case "de": return "itde"; } break; case "cn": switch (outputLanguage.getIdentifier()) { case "de": return "chde"; } break; case "ru": switch (outputLanguage.getIdentifier()) { case "de": return "rude"; } } LOGGER.warn("Unknown language configuration: {} - {}", inputLanguage, outputLanguage); return null; } private EntryType resolveSectionType(String sectionType) { return ENTRY_TYPE_MAP.getOrDefault(sectionType, EntryType.UNKNOWN); } }